import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import sklearn
In [2]:
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
In [3]:
df = pd.read_csv('./Data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()
Out[3]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [5]:
df.describe()
Out[5]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 ... 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 ... 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 ... 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 ... 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 ... 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 ... 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 ... 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 ... 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000

8 rows × 26 columns

In [6]:
df.columns
print(df.columns)
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

employee attrition value

In [7]:
df['Attrition'].value_counts().plot(kind='bar', backend='plotly', title='Attrition')

employee attrition %

In [8]:
attrition_counts = df["Attrition"].value_counts()
yes_percent = attrition_counts["Yes"] / (
    attrition_counts["Yes"] + attrition_counts["No"]
)
no_percent = attrition_counts["No"] / (attrition_counts["Yes"] + attrition_counts["No"])

fig = go.Figure(
    data=[
        go.Pie(
            labels=["Yes", "No"],
            values=[yes_percent, no_percent],
        ),
    ]
)
fig.show()

gender analysis

In [9]:
# average age of genders in the dataset
average_age_by_gender = df.groupby('Gender')['Age'].mean()
print(average_age_by_gender)
Gender
Female    37.329932
Male      36.653061
Name: Age, dtype: float64

age distribution

In [10]:
from plotly.subplots import make_subplots

# Create subplots: 2 rows, 2 columns
fig = make_subplots(rows=2, cols=2, subplot_titles=("Female Employees", "Male Employees", "Overall Employees"))

# Female employees
female_df = df[df['Gender'] == 'Female']
female_hist = px.histogram(female_df, x='Age')
female_hist.update_traces(showlegend=True)
mean_age_female = female_df['Age'].mean()

# Male employees
male_df = df[df['Gender'] == 'Male']
male_hist = px.histogram(male_df, x='Age')
male_hist.update_traces(showlegend=True)
mean_age_male = male_df['Age'].mean()

# Overall employees
overall_hist = px.histogram(df, x='Age')
overall_hist.update_traces(showlegend=True)
mean_age_overall = df['Age'].mean()

# Add traces to subplots
for trace in female_hist.data:
    fig.add_trace(trace, row=1, col=1)
    fig.update_xaxes(title_text="Age", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.add_vline(x=mean_age_female, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=1, col=1)

for trace in male_hist.data:
    fig.add_trace(trace, row=1, col=2)
    fig.update_xaxes(title_text="Age", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=2)
    fig.add_vline(x=mean_age_male, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=1, col=2)

for trace in overall_hist.data:
    fig.add_trace(trace, row=2, col=1)
    fig.update_xaxes(title_text="Age", row=2, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=1)
    fig.add_vline(x=mean_age_overall, line_width=2, line_dash="dash", line_color="red", annotation_text="Mean Age", annotation_position="top right", row=2, col=1)

# Update layout
fig.update_layout(title_text="Age Distribution of Employees", height=800)

fig.show()

distribution of job satisfaction

In [11]:
df.groupby(['Gender', 'JobSatisfaction'])['JobSatisfaction'].count()
Out[11]:
Gender  JobSatisfaction
Female  1                  119
        2                  118
        3                  181
        4                  170
Male    1                  170
        2                  162
        3                  261
        4                  289
Name: JobSatisfaction, dtype: int64
In [12]:
# Group by Gender and JobSatisfaction and count the occurrences
grouped_df = (
    df.groupby(["Gender", "JobSatisfaction"])["JobSatisfaction"]
    .count()
    .reset_index(name="Count")
)

# Create a bar plot
fig = px.bar(
    grouped_df,
    x="JobSatisfaction",
    y="Count",
    color="Gender",
    barmode="group",
    title="Job Satisfaction by Gender",
)

fig.show()

Monthly Income by Gender

In [13]:
fig = px.strip(
    df,
    x='Gender',
    y='MonthlyIncome',
    title='Disparity in Monthly Income',
    hover_data=['MonthlyIncome', 'JobSatisfaction'],
    color='Gender'
)
fig.show()

Average Income and Presence by Department

In [14]:
# Calculate the mean monthly income by gender
mean_income = df.groupby('Gender')['MonthlyIncome'].mean().reset_index()

# Create a bar plot
fig = px.bar(mean_income, x='Gender', y='MonthlyIncome', title='Average Monthly Income by Gender', color='Gender')
fig.show()
In [15]:
grouped_df = df.groupby(['Gender', 'Department'])['Department'].count().reset_index(name='Count')

# Create a bar plot
fig = px.bar(
    grouped_df,
    x="Department",
    y="Count",
    color="Gender",
    barmode="group",
    title="Number of Employees by Department",
)

fig.show()